{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:23:46.655980Z",
     "start_time": "2021-04-28T14:23:45.926568Z"
    },
    "id": "4AA5668B2F574911B6C76E41E143BDEF",
    "jupyter": {},
    "scrolled": false,
    "slideshow": {
     "slide_type": "slide"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import warnings\n",
    "import gc\n",
    "import os\n",
    "import lightgbm as lgb\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from gensim.models import Word2Vec\n",
    "from collections import OrderedDict\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import roc_auc_score\n",
    "import time\n",
    "from itertools import combinations\n",
    "\n",
    "pd.set_option('display.max_columns', None)\n",
    "pd.set_option('display.max_rows', None)\n",
    "\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:23:46.659536Z",
     "start_time": "2021-04-28T14:23:46.657559Z"
    },
    "id": "760ABA20B7B7478C82CC9C50D5C81CB9",
    "jupyter": {},
    "notebookId": "60881e4fbb1fc90018675604",
    "scrolled": false,
    "slideshow": {
     "slide_type": "slide"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "seed = 2021"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:23:51.465721Z",
     "start_time": "2021-04-28T14:23:46.660592Z"
    },
    "id": "7B41CECDA2B7418F862AEF5346103031",
    "jupyter": {},
    "scrolled": false,
    "slideshow": {
     "slide_type": "slide"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "df_train = pd.read_csv('/home/mw/input/pre8881/train.csv')\n",
    "df_test = pd.read_csv('/home/mw/input/pretest_a3048/test_a.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:23:51.475018Z",
     "start_time": "2021-04-28T14:23:51.467257Z"
    },
    "id": "1041E7F5204B43DB885E1EADAA56669E",
    "jupyter": {},
    "scrolled": false,
    "slideshow": {
     "slide_type": "slide"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "df_train.shape, df_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:23:52.049755Z",
     "start_time": "2021-04-28T14:23:51.476567Z"
    },
    "id": "0164F7942CA04865822AE48A5F351390",
    "jupyter": {},
    "notebookId": "60881e4fbb1fc90018675604",
    "scrolled": false,
    "slideshow": {
     "slide_type": "slide"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "df_feature = df_train.append(df_test, sort=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:23:52.097660Z",
     "start_time": "2021-04-28T14:23:52.051164Z"
    },
    "id": "769C1C13ADB244DCB0CB1D1C4A5DF522",
    "jupyter": {},
    "scrolled": false,
    "slideshow": {
     "slide_type": "slide"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "df_feature.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:23:52.107115Z",
     "start_time": "2021-04-28T14:23:52.100602Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature['tp_ratio'] = df_feature['nprem_tp'] / df_feature['si_tp']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:24:19.500082Z",
     "start_time": "2021-04-28T14:23:52.108544Z"
    }
   },
   "outputs": [],
   "source": [
    "# 计数\n",
    "for f in [['dpt'], ['client_no'], ['trademark_cn'], ['brand_cn'], ['make_cn'], ['series']]:\n",
    "    df_temp = df_feature.groupby(f).size().reset_index()\n",
    "    df_temp.columns = f + ['{}_count'.format('_'.join(f))]\n",
    "    df_feature = df_feature.merge(df_temp, how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:24:19.912533Z",
     "start_time": "2021-04-28T14:24:19.501393Z"
    },
    "id": "2C0A8CBFDC414C8993D60075871CDD30",
    "jupyter": {},
    "notebookId": "60881e4fbb1fc90018675604",
    "scrolled": false,
    "slideshow": {
     "slide_type": "slide"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "df_feature['birth_month'] = df_feature['birth_month'].apply(\n",
    "    lambda x: int(x[:-1]) if type(x) != float else 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:24:19.918591Z",
     "start_time": "2021-04-28T14:24:19.913888Z"
    }
   },
   "outputs": [],
   "source": [
    "# 简单统计\n",
    "def stat(df, df_merge, group_by, agg):\n",
    "    group = df.groupby(group_by).agg(agg)\n",
    "\n",
    "    columns = []\n",
    "    for on, methods in agg.items():\n",
    "        for method in methods:\n",
    "            columns.append('{}_{}_{}'.format('_'.join(group_by), on, method))\n",
    "    group.columns = columns\n",
    "    group.reset_index(inplace=True)\n",
    "    df_merge = df_merge.merge(group, on=group_by, how='left')\n",
    "\n",
    "    del (group)\n",
    "    gc.collect()\n",
    "\n",
    "    return df_merge\n",
    "\n",
    "\n",
    "def statis_feat(df_know, df_unknow):\n",
    "    for f in tqdm(['p1_census_register', 'dpt']):\n",
    "        df_unknow = stat(df_know, df_unknow, [f], {\n",
    "                         'y1_is_purchase': ['mean']})\n",
    "\n",
    "    return df_unknow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:24:32.066233Z",
     "start_time": "2021-04-28T14:24:19.919591Z"
    },
    "code_folding": [
     1
    ]
   },
   "outputs": [],
   "source": [
    "# 5折交叉\n",
    "df_train = df_feature[~df_feature['y1_is_purchase'].isnull()]\n",
    "df_train = df_train.reset_index(drop=True)\n",
    "df_test = df_feature[df_feature['y1_is_purchase'].isnull()]\n",
    "\n",
    "df_stas_feat = None\n",
    "kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)\n",
    "for train_index, val_index in kfold.split(df_train, df_train['y1_is_purchase']):\n",
    "    df_fold_train = df_train.iloc[train_index]\n",
    "    df_fold_val = df_train.iloc[val_index]\n",
    "\n",
    "    df_fold_val = statis_feat(df_fold_train, df_fold_val)\n",
    "    df_stas_feat = pd.concat([df_stas_feat, df_fold_val], axis=0)\n",
    "\n",
    "    del(df_fold_train)\n",
    "    del(df_fold_val)\n",
    "    gc.collect()\n",
    "\n",
    "df_test = statis_feat(df_train, df_test)\n",
    "df_feature = pd.concat([df_stas_feat, df_test], axis=0)\n",
    "\n",
    "del(df_stas_feat)\n",
    "del(df_train)\n",
    "del(df_test)\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:24:32.113010Z",
     "start_time": "2021-04-28T14:24:32.067408Z"
    }
   },
   "outputs": [],
   "source": [
    "df_feature.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 模型训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:24:39.717353Z",
     "start_time": "2021-04-28T14:24:32.114095Z"
    },
    "id": "EF950A935E6B41AB8F1E0CEC03DF8CCA",
    "jupyter": {},
    "notebookId": "60881e4fbb1fc90018675604",
    "scrolled": false,
    "slideshow": {
     "slide_type": "slide"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "for f in list(df_feature.select_dtypes('object')):\n",
    "    if f in ['carid', 'regdate']:\n",
    "        continue\n",
    "    le = LabelEncoder()\n",
    "    df_feature[f] = le.fit_transform(\n",
    "        df_feature[f].astype('str')).astype('int')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:24:40.023172Z",
     "start_time": "2021-04-28T14:24:39.718886Z"
    },
    "id": "784C1D25FDC94691AD7C71DD61336944",
    "jupyter": {},
    "notebookId": "60881e4fbb1fc90018675604",
    "scrolled": false,
    "slideshow": {
     "slide_type": "slide"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "df_train = df_feature[df_feature['y1_is_purchase'].notnull()]\n",
    "df_test = df_feature[df_feature['y1_is_purchase'].isnull()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:40:28.582706Z",
     "start_time": "2021-04-28T14:24:40.024569Z"
    },
    "id": "B5D42123DC18447D8817A94ADD49B2AB",
    "jupyter": {},
    "notebookId": "60881e4fbb1fc90018675604",
    "scrolled": true,
    "slideshow": {
     "slide_type": "slide"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "ycol = 'y1_is_purchase'\n",
    "feature_names = list(\n",
    "    filter(lambda x: x not in [ycol, 'regdate', 'carid'], df_train.columns))\n",
    "\n",
    "model = lgb.LGBMClassifier(num_leaves=64,\n",
    "                           max_depth=10,\n",
    "                           learning_rate=0.01,\n",
    "                           n_estimators=10000,\n",
    "                           subsample=0.8,\n",
    "                           feature_fraction=0.8,\n",
    "                           reg_alpha=0.5,\n",
    "                           reg_lambda=0.5,\n",
    "                           random_state=seed,\n",
    "                           metric=None)\n",
    "\n",
    "oof = []\n",
    "prediction = df_test[['carid']]\n",
    "prediction['label'] = 0\n",
    "df_importance_list = []\n",
    "\n",
    "kfold = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)\n",
    "for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(\n",
    "        df_train[feature_names], df_train[ycol])):\n",
    "    X_train = df_train.iloc[trn_idx][feature_names]\n",
    "    Y_train = df_train.iloc[trn_idx][ycol]\n",
    "\n",
    "    X_val = df_train.iloc[val_idx][feature_names]\n",
    "    Y_val = df_train.iloc[val_idx][ycol]\n",
    "\n",
    "    print('\\nFold_{} Training ================================\\n'.format(fold_id+1))\n",
    "\n",
    "    lgb_model = model.fit(X_train,\n",
    "                          Y_train,\n",
    "                          eval_names=['valid'],\n",
    "                          eval_set=[(X_val, Y_val)],\n",
    "                          verbose=500,\n",
    "                          eval_metric='auc',\n",
    "                          early_stopping_rounds=50)\n",
    "\n",
    "    pred_val = lgb_model.predict_proba(\n",
    "        X_val, num_iteration=lgb_model.best_iteration_)[:, 1]\n",
    "    df_oof = df_train.iloc[val_idx][[\n",
    "        'carid', ycol]].copy()\n",
    "    df_oof['pred'] = pred_val\n",
    "    oof.append(df_oof)\n",
    "\n",
    "    pred_test = lgb_model.predict_proba(\n",
    "        df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]\n",
    "    prediction['label'] += pred_test / 5\n",
    "\n",
    "    df_importance = pd.DataFrame({\n",
    "        'column': feature_names,\n",
    "        'importance': lgb_model.feature_importances_,\n",
    "    })\n",
    "    df_importance_list.append(df_importance)\n",
    "\n",
    "    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val\n",
    "    gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:40:28.597051Z",
     "start_time": "2021-04-28T14:40:28.583851Z"
    },
    "id": "25F80E8277D0459495CFBA4A007964EB",
    "jupyter": {},
    "notebookId": "60881e4fbb1fc90018675604",
    "scrolled": true,
    "slideshow": {
     "slide_type": "slide"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "df_importance = pd.concat(df_importance_list)\n",
    "df_importance = df_importance.groupby(['column'])['importance'].agg(\n",
    "    'mean').sort_values(ascending=False).reset_index()\n",
    "df_importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:40:28.862348Z",
     "start_time": "2021-04-28T14:40:28.598023Z"
    },
    "id": "F9ABEBA707364ED3834E0E074CA2F989",
    "jupyter": {},
    "notebookId": "60881e4fbb1fc90018675604",
    "scrolled": false,
    "slideshow": {
     "slide_type": "slide"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "df_oof = pd.concat(oof)\n",
    "score = roc_auc_score(df_oof['y1_is_purchase'], df_oof['pred'])\n",
    "score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:40:28.865884Z",
     "start_time": "2021-04-28T14:40:28.863367Z"
    }
   },
   "outputs": [],
   "source": [
    "score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:54:44.470340Z",
     "start_time": "2021-04-28T14:54:44.455375Z"
    },
    "id": "FE1D582019674017823D0BA541210420",
    "jupyter": {},
    "notebookId": "60881e4fbb1fc90018675604",
    "scrolled": false,
    "slideshow": {
     "slide_type": "slide"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "df_oof.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:40:28.925397Z",
     "start_time": "2021-04-28T14:40:28.875204Z"
    }
   },
   "outputs": [],
   "source": [
    "prediction.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2021-04-28T14:40:29.825598Z",
     "start_time": "2021-04-28T14:40:28.926460Z"
    }
   },
   "outputs": [],
   "source": [
    "os.makedirs('sub', exist_ok=True)\n",
    "prediction.to_csv(f'sub/{score}.csv', index=False)\n",
    "prediction.to_csv(f'sub/sub.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:dm] *",
   "language": "python",
   "name": "conda-env-dm-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
